library(httr)
library(curl)
library(ggplot2)
library(plotly)
library(plyr)
library(stringr)
library(sqldf)
library(dplyr)
library(tidytext)
library(tidyverse)
library(gganimate)
library(readxl)
library(rvest)
library(lubridate)
library(stringi)
library(rvest)
library(jsonlite)
library(choroplethr)
library(choroplethrMaps)
library(rlang)
library(topicmodels)

The usecase numbers are aligned with the slides presentation.

library(readxl)
library(dplyr)
kaggle_data <-readxl::read_excel("athlete_events.xlsx")
#Usecase 1 : Growth in the number of sports and events in Summer & Winter Games – Kaggle Data range from 1896 to 2016
Usecase_1 <- kaggle_data %>% distinct(Year,Season,Sport)
Usecase_1 <- Usecase_1 %>% select(Year,Season) %>% dplyr::arrange(Year,Season) %>% dplyr::count(Year,Season) %>% dplyr::rename(freq=n)
plot <- plot_ly(data = Usecase_1, x = ~Year, y = ~freq, type = 'scatter',
        marker = list(size = 10,
                      color = 'green',
                      line = list(color = 'orange',
                                  width = 2)))
plot <- plot_ly() %>%
  add_trace( data = dplyr::filter(Usecase_1,Season == "Summer"),
             x = ~Year, y = ~freq, type = 'scatter', 
             name = 'Summer Sports',
             marker = list(size = 10,
                           color = 'red',
                           line = list(color = 'green',
                                       width = 2))) %>%
  add_trace( data = dplyr::filter(Usecase_1,Season == "Winter"),
             x = ~Year, y = ~freq, type = 'scatter', 
             name = 'Winert Sports',
             marker = list(size = 10,
                           color = 'yellow',
                           line = list(color = 'green',
                                       width = 2)))
plot <- plot %>% layout(title = 'Number of Summer Vs Winter sport Olympics since 1920',
                      plot_bgcolor = "#e5ecf6",scale_fill_brewer(palette = "RdYlGn"),
                      yaxis = list(zeroline = FALSE, title = "Year of Olympics"),
                      xaxis = list(zeroline = FALSE, title = "Number of Sports"),
                      legend = list(title=list(text='Legend Title')))
ggplotly(plot)
#From the plot we can see that the growth in the number of winter sports is very less when compared to the summer sports, and this is due to weather, ease of hosting and the audience turn out not being on the side of winter sports
#2. Gender Participation in Olympics – Kaggle Data range from 1896 to 2016
data(df_pop_country)
kaggle_data <-  readxl::read_excel("athlete_events.xlsx")
Usecase_2 <- kaggle_data %>% select(Team,Sex) 
Usecase_2 %>% filter(Team == "united states of america")
## # A tibble: 0 × 2
## # … with 2 variables: Team <chr>, Sex <chr>
Usecase_2 <- Usecase_2 %>% 
  mutate(Team = str_extract(Team,"[a-zA-Z' ']+")) %>% 
  mutate(Team = str_to_lower(Team,"[a-zA-Z]+"))
Usecase_2$Team <- str_replace(Usecase_2$Team, "united states","united states of america")
Usecase_2 %>% filter(Team == "united states of america")
## # A tibble: 18,475 × 2
##    Team                     Sex  
##    <chr>                    <chr>
##  1 united states of america M    
##  2 united states of america M    
##  3 united states of america M    
##  4 united states of america M    
##  5 united states of america M    
##  6 united states of america M    
##  7 united states of america M    
##  8 united states of america M    
##  9 united states of america M    
## 10 united states of america M    
## # … with 18,465 more rows
#2. Nations (in terms of numbers) participating in Olympics to identify gender distribution 
result_df <- sqldf("select Team as region, Sex, count(Sex) as value from Usecase_2 Group by Team,Sex")
Usecase_2 <- as_tibble(result_df)
Usecase_2_M <- Usecase_2 %>% filter(Sex == 'M') %>% select(1,3)
Usecase_2_M <- inner_join(df_pop_country,Usecase_2_M, by = 'region') %>% select(1,3) 
Usecase_2_M <- as_tibble(Usecase_2_M)
Usecase_2_M <- dplyr::rename(Usecase_2_M,value = value.y)
head(Usecase_2_M)
## # A tibble: 6 × 2
##   region               value
##   <chr>                <int>
## 1 united arab emirates   137
## 2 afghanistan            121
## 3 albania                 43
## 4 armenia                183
## 5 angola                 138
## 6 argentina             2590
Usecase_2_M %>% filter(region == "china")
## # A tibble: 1 × 2
##   region value
##   <chr>  <int>
## 1 china   2371
country_choropleth(Usecase_2_M)

Usecase_2_F <- Usecase_2 %>% filter(Sex == 'F') %>% select(1,3)
Usecase_2_F <- inner_join(df_pop_country,Usecase_2_F, by = 'region') %>% select(1,3) 
Usecase_2_F <- as_tibble(Usecase_2_F)
Usecase_2_F <- dplyr::rename(Usecase_2_F,value = value.y)
head(Usecase_2_F)
## # A tibble: 6 × 2
##   region               value
##   <chr>                <int>
## 1 united arab emirates     7
## 2 afghanistan              5
## 3 albania                 27
## 4 armenia                 38
## 5 angola                 129
## 6 argentina              643
Usecase_2_F %>% filter(region == "china")
## # A tibble: 1 × 2
##   region value
##   <chr>  <int>
## 1 china   2770
country_choropleth(Usecase_2_F)

# The Chloropleth plots of Men and women participation show differences in the scale due to 
# women entering Olympics later than men. We can also see that there are cases where the men participation is much higher than women participation (like Denmark) or where women participation better than men's participation (like China)
## usecase3 : Medal Summary by Nations
url <- "https://en.wikipedia.org/wiki/Swimming_at_the_Summer_Olympics"
extractdata <- url %>% read_html() %>% html_table(fill = TRUE)
Swimming_Medal_Summary_byNations <- extractdata[[11]] %>% select(2,6)
head(Swimming_Medal_Summary_byNations)
## # A tibble: 6 × 2
##   Nation              Total
##   <chr>               <int>
## 1 United States (USA)   579
## 2 Australia (AUS)       213
## 3 East Germany (GDR)     92
## 4 Hungary (HUN)          76
## 5 Japan (JPN)            83
## 6 Netherlands (NED)      62
url <- "https://en.wikipedia.org/wiki/List_of_Olympic_medalists_in_speed_skating"
extract <- url %>% read_html() %>% html_table(fill = TRUE)
Skating_medal_summary <- extract[[34]] %>% select(Country,Total)
head(Skating_medal_summary)
## # A tibble: 6 × 2
##   Country        Total
##   <chr>          <chr>
## 1 Austria (AUT)  6    
## 2 Belarus        1    
## 3 Belgium        3    
## 4 Canada (CAN)   42   
## 5 China          9    
## 6 Czech Republic 7
Swimming_Medal_Summary <- Swimming_Medal_Summary_byNations
Swimming_Medal_Summary <- dplyr::rename(Swimming_Medal_Summary,Country = Nation)
Swimming_Medal_Summary <- Swimming_Medal_Summary %>% arrange(Total) %>% head(-1)
Swimming_Medal_Summary <- Swimming_Medal_Summary %>%
  mutate(Country = str_extract(Country, "[a-zA-Z' ']+"))
Swimming_Medal_Summary <- Swimming_Medal_Summary %>% mutate(Sport = "Swimming")
head(Swimming_Medal_Summary)
## # A tibble: 6 × 3
##   Country    Total Sport   
##   <chr>      <int> <chr>   
## 1 Kazakhstan     1 Swimming
## 2 Lithuania      1 Swimming
## 3 Singapore      1 Swimming
## 4 Croatia        1 Swimming
## 5 Serbia         1 Swimming
## 6 Slovenia       1 Swimming
Skating_medal_summary$Total <- as.integer(Skating_medal_summary$Total)
Skating_medal_summary <- Skating_medal_summary %>% na.omit()
Skating_medal_summary <- Skating_medal_summary %>%
  mutate(Country = str_extract(Country, "[a-zA-Z' ']+"))
Skating_medal_summary <- Skating_medal_summary %>% mutate(Sport = "Skating")
tail(Skating_medal_summary)  
## # A tibble: 6 × 3
##   Country                      Total Sport  
##   <chr>                        <int> <chr>  
## 1 Russia                          13 Skating
## 2 Olympic Athletes from Russia     3 Skating
## 3 South Korea                     20 Skating
## 4 Soviet Union                    60 Skating
## 5 Sweden                          18 Skating
## 6 United States                   71 Skating
Extract <- bind_rows(Swimming_Medal_Summary,Skating_medal_summary) 
Extract <- Extract %>% arrange(Country,Sport)

Extract %>% 
  ggplot(aes( x = reorder(Country,Total,sum), y= Total, fill=Sport,
              label = Total)) +
  geom_bar(stat="identity")+ xlab(NULL) +coord_flip() +
  geom_text(size = 3, position = position_stack(vjust = 0.5)) +
  xlab("Country") + ylab("Total number of Medals")

#The graph gives visualization of the total medals won by countries across the world in skating and swimming and at the same time the graph
#Observation from the graph: USA is obviously standing out with the medal count. 
#A section of nations have been performing well consistently where as another section of nations have been performing not so very well at all
#Through the same graph we can see the overall performance as well the as the performance 
# for each sport at the same time
##4:Male vs Female participation over time in Olympics. – Input Kaggle – Year, Gender 
rm(Medal_Summary_byGender)
library(readxl)
Medal_Summary_byGender <- readxl::read_excel("athlete_events.xlsx")
Medal_Summary_byGender <- Medal_Summary_byGender %>% filter(Sport != "Art Competitions")
## Year" for summer and winter
original <- c(1994,1998,2002,2006,2010,2014)
new <- c(1996,2000,2004,2008,2012,2016,2020)
for (i in 1:length(original)) {
  Medal_Summary_byGender$Year <- gsub(original[i], new[i], Medal_Summary_byGender$Year)
}
Medal_Summary_byGender$Year <- as.integer(Medal_Summary_byGender$Year)
## Table counting number of athletes by Year and Sex
count_byGender <- Medal_Summary_byGender %>% group_by(Year, Sex) %>% select(Year,Sex) %>% count() %>% dplyr::rename(Count=n)

count_byGender$Year <- as.integer(count_byGender$Year)
count_byGender <- count_byGender %>%  dplyr::arrange(desc(Count))
## Plot number of male/female athletes vs time
Plot_gender_distribution <- ggplot(count_byGender, aes(x=Year, y=Count, group=(Sex), color=Sex)) +
  geom_point(size=3) +
  geom_line() +
  scale_color_manual(values=c("darkgreen","Orange")) +
  labs(x = "Year", y = "Olympians Count",title = "Number of Male and Female Olympians Participation over time in Years")
Plot_gender_distribution

##here is a clear increase in the inclusion of the female after 1950 in competitions, however, the male participation is erratic.
##Zero Female Participation in 7 games. ex: Jeu De Paume,Lacrosse,Military Ski Patrol,Polo,Racquets,Roque,Tug-Of-War
##Zero Male participation in Rhythmic Gymnastics.
#   Male vs Female participation over time in Olympics Medal summary
rm(Medal_Summary_bynations)
library(readxl)
Medal_Summary_bynations<- readxl::read_excel("athlete_events.xlsx")
Medal_Summary_bynations <- Medal_Summary_bynations  %>%   filter(Medal != "NA")  %>%  na.omit()
Medal_Summary_bynations <- Medal_Summary_bynations %>%  filter(Sport != "Art Competitions")  %>%
                           dplyr::rename(region = Team)  %>%
                           group_by(NOC, Medal, Sport, Year,Sex,region) %>% summarise(isMedal=1) %>%
                           dplyr::arrange(Medal,Year)

Medal_Summary_bynations <-  Medal_Summary_bynations %>% group_by(Medal,Sex,Year) %>%
                            summarise(Count= sum(isMedal)) %>%  dplyr::arrange(Year)


# TOTAL MEDAL COUNT for Male by Year over time
Medal_Summary_bynations_Male  <- Medal_Summary_bynations %>% filter(Sex == "M") %>%
  group_by(Medal,Year,Sex) %>% summarise(Total=sum(Count)) %>% dplyr::arrange(desc(Medal))

## Plot
plot_Male <- ggplot(data=Medal_Summary_bynations_Male, aes(Year,Total ,  color=Medal)) + 
  geom_point() + ggtitle("Number of Male Olympians Participation over time in Years") +
  geom_smooth() 
plot_Male

# TOTAL MEDAL COUNT for Female by Year over time
Medal_Summary_bynations_Female  <- Medal_Summary_bynations %>% filter(Sex == "F") %>%
  group_by(Medal,Year,Sex) %>% summarise(Total=sum(Count)) %>% dplyr::arrange(desc(Medal))

plot_Female <- ggplot(data= Medal_Summary_bynations_Female, aes(Year,Total,color=Medal)) +
  geom_point() +  ggtitle("Number of female Olympians Participation over time in Years") +
  geom_smooth()
plot_Female

## Female participations started increasing after 1976 with 12 teams bringing in at least 25% women.
##In 1992, the winter sports are separated which skews the trends for both.
##Medal Dominations by country: American women : 2016
##East German & Soviet Women : 1976
##Germany women : 1936
##6.    Locations of Olympics by year choloropleth - Male Vs Female
rm(Olympic_Games_host_cities)
rm(Olympic_Games_host_cities_1)
Olympic_Games_host_cities <- "https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities"
Olympic_Games_host_cities <- Olympic_Games_host_cities[1] %>% read_html() %>% html_table(fill = TRUE)
Olympic_Games_host_cities_1 <- Olympic_Games_host_cities[[2]]  %>% select(2,3,4,6,7)
Olympic_Games_host_cities_df <- as.data.frame(Olympic_Games_host_cities_1)
Olympic_Games_host_cities_df_summer <- Olympic_Games_host_cities_df %>% 
                                      arrange(Year)
Olympic_Games_host_cities_df_summer <- Olympic_Games_host_cities_df_summer %>% dplyr::rename(region=Country)
Olympic_Games_host_cities_df_summer$region = tolower(Olympic_Games_host_cities_df_summer$region )
Olympic_Games_host_cities_df_summer[Olympic_Games_host_cities_df_summer$region=="soviet union[h]", "region"] <- "russia"
Olympic_Games_host_cities_df_summer[Olympic_Games_host_cities_df_summer$region=="russia[h]", "region"] <- "russia"
Olympic_Games_host_cities_df_summer <- Olympic_Games_host_cities_df_summer %>% 
  dplyr::rename(value=Year)
Olympic_Games_host_cities_df_summer <- Olympic_Games_host_cities_df_summer %>% select(2,3)
Olympic_Games_host_cities_df_summer <- Olympic_Games_host_cities_df_summer %>% arrange(region,value) %>% 
                                       dplyr::count(region) %>% dplyr::rename(value = n)
Olympic_Games_host_cities_df_summer$region <- str_replace(Olympic_Games_host_cities_df_summer$region, "united states","united states of america")
country_choropleth(Olympic_Games_host_cities_df_summer,num_colors=5 ) + 
scale_fill_brewer(palette="YlOrRd") +
  labs(title = "Countries hosted Olympics",
       subtitle = "Year range",
       caption = "source: https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities")

My first task was to extract the Kaggle data and filter for my sport:Athlectics (Track & Field). The Kaggle table provides important information about the contestants and the event, but with one glaring ommission: Kaggle does not include the performance (time, height, score, etc.) of the actual event. For this reason, additional data had to be scraped from Wikipedia.

kaggle_athletics <- read_excel("athlete_events.xlsx")%>%
filter(Sport == "Athletics")
head(kaggle_athletics)
## # A tibble: 6 × 15
##      ID Name      Sex   Age   Height Weight Team  NOC   Games  Year Season City 
##   <dbl> <chr>     <chr> <chr> <chr>  <chr>  <chr> <chr> <chr> <dbl> <chr>  <chr>
## 1     8 "Corneli… F     18    168    NA     Neth… NED   1932…  1932 Summer Los …
## 2     8 "Corneli… F     18    168    NA     Neth… NED   1932…  1932 Summer Los …
## 3    18 "Timo An… M     31    189    130    Finl… FIN   2000…  2000 Summer Sydn…
## 4    31 "Evald r… M     24    174    70     Esto… EST   1936…  1936 Summer Berl…
## 5    32 "Olav Au… M     23    NA     NA     Norw… NOR   1912…  1912 Summer Stoc…
## 6    34 "Jamale … M     30    187    76     Fran… FRA   2012…  2012 Summer Lond…
## # … with 3 more variables: Sport <chr>, Event <chr>, Medal <chr>

Wikipedia has separate sites for each Olympic year with separate tables, which varied by year, for male and female performance. To join the Wikipedia data with Kaggle, we first had to scrape it from the Wikipedia site, rename the columns, clean the event fields in both the Kaggle and Wikipedia files, so that they would match.

To do this, We created the “olympic_join” function function that accepted year, Wikipedia URL, table number (varied by year) and gender. The join resulted in two combined files (“M” and “F”) for Gold, Silver and Bronze for each Olympic year.

olympic_join = function(func_year, func_url,table_num,gender) {
#Get the Kaggle files and remove the prefixes "Athletics Men's" and Athletics Women's from the event field.
  kaggle_athletics_year <- kaggle_athletics%>%
  filter(Sex == gender, Year == func_year) %>% arrange(Year) %>%
  mutate(Event = trimws(str_replace_all(Event, "Athletics Men's", ""))) %>%
  mutate(Event = trimws(str_replace_all(Event, "Athletics Women's", ""))) 
  
#Scrape the data from Wikipedia given the functional year.
  athletics_year <- func_url%>%
  read_html() %>%
  html_nodes("table")%>%
  html_table(fill=TRUE)
  
#rename columns to guaranty first column is always "Event" even if column title changes for each Olympic year, and each of the pairs of name and performance for each medal is titled "Name" and "Time."
wiki_year <- athletics_year[[table_num]]
colnames(wiki_year)[1] <- "Event"
colnames(wiki_year)[2] <- "Name"
colnames(wiki_year)[3] <- "Time"
colnames(wiki_year)[4] <- "Name"
colnames(wiki_year)[5] <- "Time"
colnames(wiki_year)[6] <- "Name"
colnames(wiki_year)[7] <- "Time"

#Clean field on Wiki, removing "details". Join with Kaggle for gold medalists (rank =1).
wiki_year_g<- wiki_year %>%
  select (1:3)%>%
    mutate(Event = str_replace_all(Event,"details",""))
wiki_year_g$Rank <- "1"

#Kaggle athletic data and wikipedia data joined for year Gold.
kaggle_athletics_year_g <- kaggle_athletics_year%>%
  filter(Year == func_year, Medal == "Gold", Event == "Marathon" | Event == "100 metres" | Event == "800 metres")
final_year_g <- left_join(kaggle_athletics_year_g,wiki_year_g,by = "Event")

#Repeat year for Silver
wiki_year_s<- wiki_year %>%
  select (1, 4:5)%>%
    mutate(Event = str_replace_all(Event,"details",""))
wiki_year_s$Rank <- "2"

#Kaggle athletic data and wikipedia data joined for year and silver medalists.
kaggle_athletics_year_s <- kaggle_athletics%>%
  filter(Year == func_year, Medal == "Silver", Event == "Marathon" | Event == "100 metres" | Event == "800 metres")
final_year_s <- left_join(kaggle_athletics_year_s,wiki_year_s,by = "Event")

#Do the same for Bronze
wiki_year_b<- wiki_year %>%
  select (1, 6:7)%>%
    mutate(Event = str_replace_all(Event,"details",""))
wiki_year_b$Rank <- "3"

#Kaggle athletic data and wikipedia data joined for year and bronze medalists.
kaggle_athletics_year_b <- kaggle_athletics%>%
  filter(Year == func_year, Medal == "Bronze", Event == "Marathon" | Event == "100 metres" | Event == "800 metres")
final_year_b <- left_join(kaggle_athletics_year_b,wiki_year_b,by = "Event")

#Consolidate gold, silver and bronze for year 
final_year <- final_year_g %>%
   rbind.data.frame(final_year_s) %>%
   rbind.data.frame(final_year_b)
#   select(1:7, 10:13, 15, 16, 19)
final_year
}

Begin calling the function olympic_join for each of the Olympics by year, starting in 2016.

result_m <- olympic_join("2016","https://en.wikipedia.org/wiki/Athletics_at_the_2016_Summer_Olympics", 6, "M")
result_w <- olympic_join("2016","https://en.wikipedia.org/wiki/Athletics_at_the_2016_Summer_Olympics", 7, "F")

The rows of each subsequent years are appended to 2016, separated by sex.

result_new <- olympic_join("2012", "https://en.wikipedia.org/wiki/Athletics_at_the_2012_Summer_Olympics",7, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("2012", "https://en.wikipedia.org/wiki/Athletics_at_the_2012_Summer_Olympics",8, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("2008", "https://en.wikipedia.org/wiki/Athletics_at_the_2008_Summer_Olympics",4, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("2008", "https://en.wikipedia.org/wiki/Athletics_at_the_2012_Summer_Olympics",5, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("2004", "https://en.wikipedia.org/wiki/Athletics_at_the_2004_Summer_Olympics",3, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("2004", "https://en.wikipedia.org/wiki/Athletics_at_the_2004_Summer_Olympics",4, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("2000", "https://en.wikipedia.org/wiki/Athletics_at_the_2000_Summer_Olympics",3, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("2000", "https://en.wikipedia.org/wiki/Athletics_at_the_2000_Summer_Olympics",4, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1996", "https://en.wikipedia.org/wiki/Athletics_at_the_1996_Summer_Olympics",2, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1996", "https://en.wikipedia.org/wiki/Athletics_at_the_1996_Summer_Olympics",3, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1992", "https://en.wikipedia.org/wiki/Athletics_at_the_1992_Summer_Olympics",2, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1992", "https://en.wikipedia.org/wiki/Athletics_at_the_1992_Summer_Olympics",3, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
head(result_w)
## # A tibble: 6 × 18
##       ID Name.x   Sex   Age   Height Weight Team  NOC   Games  Year Season City 
##    <dbl> <chr>    <chr> <chr> <chr>  <chr>  <chr> <chr> <chr> <dbl> <chr>  <chr>
## 1 108532 Mokgadi… F     25    178    70     Sout… RSA   2016…  2016 Summer Rio …
## 2 116424 Jemima … F     31    160    45     Kenya KEN   2016…  2016 Summer Rio …
## 3 119903 Elaine … F     24    167    57     Jama… JAM   2016…  2016 Summer Rio …
## 4  36870 Shelly-… F     25    160    57     Jama… JAM   2012…  2012 Summer Lond…
## 5  39396 Erba Ti… F     24    165    48     Ethi… ETH   2012…  2012 Summer Lond…
## 6 106272 Mariya … F     26    172    60     Russ… RUS   2012…  2012 Summer Lond…
## # … with 6 more variables: Sport <chr>, Event <chr>, Medal <chr>, Name.y <chr>,
## #   Time <chr>, Rank <chr>
result_new <- olympic_join("1988", "https://en.wikipedia.org/wiki/Athletics_at_the_1988_Summer_Olympics",2, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1988", "https://en.wikipedia.org/wiki/Athletics_at_the_1988_Summer_Olympics",3, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1984", "https://en.wikipedia.org/wiki/Athletics_at_the_1984_Summer_Olympics",3, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1984", "https://en.wikipedia.org/wiki/Athletics_at_the_1984_Summer_Olympics",4, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1980", "https://en.wikipedia.org/wiki/Athletics_at_the_1980_Summer_Olympics",4, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1980", "https://en.wikipedia.org/wiki/Athletics_at_the_1980_Summer_Olympics",5, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1976", "https://en.wikipedia.org/wiki/Athletics_at_the_1976_Summer_Olympics",1, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1976", "https://en.wikipedia.org/wiki/Athletics_at_the_1976_Summer_Olympics",2, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1972", "https://en.wikipedia.org/wiki/Athletics_at_the_1972_Summer_Olympics",1, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1972", "https://en.wikipedia.org/wiki/Athletics_at_the_1972_Summer_Olympics",2, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1968", "https://en.wikipedia.org/wiki/Athletics_at_the_1968_Summer_Olympics",2, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1968", "https://en.wikipedia.org/wiki/Athletics_at_the_1968_Summer_Olympics",3, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1964", "https://en.wikipedia.org/wiki/Athletics_at_the_1964_Summer_Olympics",1, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1964", "https://en.wikipedia.org/wiki/Athletics_at_the_1964_Summer_Olympics",2, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1960", "https://en.wikipedia.org/wiki/Athletics_at_the_1960_Summer_Olympics",3, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1960", "https://en.wikipedia.org/wiki/Athletics_at_the_1960_Summer_Olympics",4, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1956", "https://en.wikipedia.org/wiki/Athletics_at_the_1956_Summer_Olympics",2, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1956", "https://en.wikipedia.org/wiki/Athletics_at_the_1956_Summer_Olympics",3, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1952", "https://en.wikipedia.org/wiki/Athletics_at_the_1952_Summer_Olympics",2, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1952", "https://en.wikipedia.org/wiki/Athletics_at_the_1952_Summer_Olympics",3, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1948", "https://en.wikipedia.org/wiki/Athletics_at_the_1948_Summer_Olympics",3, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1948", "https://en.wikipedia.org/wiki/Athletics_at_the_1948_Summer_Olympics",4, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
result_new <- olympic_join("1936", "https://en.wikipedia.org/wiki/Athletics_at_the_1936_Summer_Olympics",3, "M")
result_m <- result_m %>%
rbind.data.frame(result_new)
head(result_m)
## # A tibble: 6 × 18
##       ID Name.x   Sex   Age   Height Weight Team  NOC   Games  Year Season City 
##    <dbl> <chr>    <chr> <chr> <chr>  <chr>  <chr> <chr> <chr> <dbl> <chr>  <chr>
## 1  13029 Usain S… M     29    196    95     Jama… JAM   2016…  2016 Summer Rio …
## 2  60589 Eliud K… M     31    167    57     Kenya KEN   2016…  2016 Summer Rio …
## 3 103274 David L… M     27    190    76     Kenya KEN   2016…  2016 Summer Rio …
## 4  13029 Usain S… M     25    196    95     Jama… JAM   2012…  2012 Summer Lond…
## 5  60618 Stephen… M     23    172    56     Ugan… UGA   2012…  2012 Summer Lond…
## 6 103274 David L… M     23    190    76     Kenya KEN   2012…  2012 Summer Lond…
## # … with 6 more variables: Sport <chr>, Event <chr>, Medal <chr>, Name.y <chr>,
## #   Time <chr>, Rank <chr>
result_new <- olympic_join("1936", "https://en.wikipedia.org/wiki/Athletics_at_the_1936_Summer_Olympics",4, "F")
result_w <- result_w %>%
rbind.data.frame(result_new)
head(result_w)
## # A tibble: 6 × 18
##       ID Name.x   Sex   Age   Height Weight Team  NOC   Games  Year Season City 
##    <dbl> <chr>    <chr> <chr> <chr>  <chr>  <chr> <chr> <chr> <dbl> <chr>  <chr>
## 1 108532 Mokgadi… F     25    178    70     Sout… RSA   2016…  2016 Summer Rio …
## 2 116424 Jemima … F     31    160    45     Kenya KEN   2016…  2016 Summer Rio …
## 3 119903 Elaine … F     24    167    57     Jama… JAM   2016…  2016 Summer Rio …
## 4  36870 Shelly-… F     25    160    57     Jama… JAM   2012…  2012 Summer Lond…
## 5  39396 Erba Ti… F     24    165    48     Ethi… ETH   2012…  2012 Summer Lond…
## 6 106272 Mariya … F     26    172    60     Russ… RUS   2012…  2012 Summer Lond…
## # … with 6 more variables: Sport <chr>, Event <chr>, Medal <chr>, Name.y <chr>,
## #   Time <chr>, Rank <chr>

Select columns from women’s and men’s data frames, eliminating duplicates that were created in the join.

athletics_m_final <- result_m%>%
select(2:7,10,12:14,17,18)
head(athletics_m_final)
## # A tibble: 6 × 12
##   Name.x     Sex   Age   Height Weight Team   Year City  Sport Event Time  Rank 
##   <chr>      <chr> <chr> <chr>  <chr>  <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 Usain St.… M     29    196    95     Jama…  2016 Rio … Athl… 100 … 9.81  1    
## 2 Eliud Kip… M     31    167    57     Kenya  2016 Rio … Athl… Mara… 2:08… 1    
## 3 David Lek… M     27    190    76     Kenya  2016 Rio … Athl… 800 … 1:42… 1    
## 4 Usain St.… M     25    196    95     Jama…  2012 Lond… Athl… 100 … 9.63… 1    
## 5 Stephen K… M     23    172    56     Ugan…  2012 Lond… Athl… Mara… 2:08… 1    
## 6 David Lek… M     23    190    76     Kenya  2012 Lond… Athl… 800 … 1:40… 1
athletics_w_final <- result_w%>%
select(2:7,10,12:14,17,18)
head(athletics_w_final)
## # A tibble: 6 × 12
##   Name.x     Sex   Age   Height Weight Team   Year City  Sport Event Time  Rank 
##   <chr>      <chr> <chr> <chr>  <chr>  <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 Mokgadi C… F     25    178    70     Sout…  2016 Rio … Athl… 800 … 1:55… 1    
## 2 Jemima Je… F     31    160    45     Kenya  2016 Rio … Athl… Mara… 2:24… 1    
## 3 Elaine Th… F     24    167    57     Jama…  2016 Rio … Athl… 100 … 10.71 1    
## 4 Shelly-An… F     25    160    57     Jama…  2012 Lond… Athl… 100 … 10.75 1    
## 5 Erba Tiki… F     24    165    48     Ethi…  2012 Lond… Athl… Mara… 2:23… 1    
## 6 Mariya Se… F     26    172    60     Russ…  2012 Lond… Athl… 800 … <NA>  <NA>

Remove non-numeric notes from the time fields. Convert hms to decimal seconds where needed for both men and women. Compute BMI = height/weight**2. As professor suggestion, we substituted BMI for weight when exploring correlation. This is the representation Correlation: Performance vs. : BMI.

athletics_m_final <-athletics_m_final %>% na.omit() %>%
    mutate(Time = str_extract(Time,"[0-9':''.']+"))

athletics_m_hms <- athletics_m_final %>%
  filter(str_detect(athletics_m_final$Time,":")) 
  athletics_m_hms$Time <- period_to_seconds(hms(athletics_m_hms$Time))

athletics_m_final <- athletics_m_final %>%
  filter(str_detect(athletics_m_final$Time,":") == "FALSE") %>% 
rbind(athletics_m_hms)%>%
arrange(Event)
head(athletics_m_final)
## # A tibble: 6 × 12
##   Name.x     Sex   Age   Height Weight Team   Year City  Sport Event Time  Rank 
##   <chr>      <chr> <chr> <chr>  <chr>  <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 Usain St.… M     29    196    95     Jama…  2016 Rio … Athl… 100 … 9.81  1    
## 2 Usain St.… M     25    196    95     Jama…  2012 Lond… Athl… 100 … 9.63  1    
## 3 Usain St.… M     21    196    95     Jama…  2008 Beij… Athl… 100 … 9.69  1    
## 4 Justin Al… M     22    186    80     Unit…  2004 Athi… Athl… 100 … 9.85  1    
## 5 Maurice G… M     26    176    80     Unit…  2000 Sydn… Athl… 100 … 9.87  1    
## 6 Donovan B… M     28    183    82     Cana…  1996 Atla… Athl… 100 … 9.84  1
athletics_w_final <-athletics_w_final %>% na.omit() %>%
    mutate(Time = str_extract(Time,"[0-9':''.']+"))

athletics_w_hms <- athletics_w_final %>%
  filter(str_detect(athletics_w_final$Time,":")) 
  athletics_w_hms$Time <- period_to_seconds(hms(athletics_w_hms$Time))

athletics_w_final <- athletics_w_final %>%
  filter(str_detect(athletics_w_final$Time,":") == "FALSE") %>% 
rbind(athletics_w_hms )%>%
 arrange(Event)
 head(athletics_w_final)
## # A tibble: 6 × 12
##   Name.x     Sex   Age   Height Weight Team   Year City  Sport Event Time  Rank 
##   <chr>      <chr> <chr> <chr>  <chr>  <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 Elaine Th… F     24    167    57     Jama…  2016 Rio … Athl… 100 … 10.71 1    
## 2 Shelly-An… F     25    160    57     Jama…  2012 Lond… Athl… 100 … 10.75 1    
## 3 Yolanda G… F     29    160    55     Unit…  1996 Atla… Athl… 100 … 10.94 1    
## 4 Yolanda G… F     25    160    55     Unit…  1992 Barc… Athl… 100 … 10.82 1    
## 5 Delorez F… F     28    170    57     Unit…  1988 Seoul Athl… 100 … 10.54 1    
## 6 Evelyn As… F     27    165    52     Unit…  1984 Los … Athl… 100 … 10.97 1
athletics_m_final$BMI <- as.numeric(athletics_m_final$Weight)/((as.numeric(athletics_m_final$Height)/100)**2)
athletics_w_final$BMI <-        
 as.numeric(athletics_w_final$Weight)/((as.numeric(athletics_w_final$Height)/100)**2)

Perform a multi-variate correlation of BMI with Time for both men and women for each of the three designated Athletics Events(Marathon, 100 hundred metres, and 800 metres) Only 100 meter produced a significant correlation, with the p-value of the estimate (approximately 0.018 for the men). The r_square = 0.2871 was not high enough to conclude that the variation in the speed was solely due to the athlete’s BMI. Clearly, a multi-variate model, including other explanatory variables should be explored.

marathon_m <- athletics_m_final %>%
  filter(Event == "Marathon") 
marathon_lm_m = lm(as.numeric(marathon_m$Time)~marathon_m$BMI)
  summary(marathon_lm_m)
## 
## Call:
## lm(formula = as.numeric(marathon_m$Time) ~ marathon_m$BMI)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -480.8 -358.2 -136.1  284.9 1169.8 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)     6813.85    2164.09   3.149  0.00621 **
## marathon_m$BMI    65.59     109.41   0.599  0.55728   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 490.5 on 16 degrees of freedom
## Multiple R-squared:  0.02196,    Adjusted R-squared:  -0.03916 
## F-statistic: 0.3593 on 1 and 16 DF,  p-value: 0.5573
marathon_w <- athletics_w_final %>%
  filter(Event == "Marathon") 
marathon_lm_w = lm(as.numeric(marathon_w$Time)~marathon_w$BMI)
  summary(marathon_lm_w)
## 
## Call:
## lm(formula = as.numeric(marathon_w$Time) ~ marathon_w$BMI)
## 
## Residuals:
##        1        2        3        4        5        6        7 
##   45.958  -16.822 -276.171   -4.299  208.549   67.625  -24.841 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)     6671.41    1012.54   6.589  0.00121 **
## marathon_w$BMI   109.60      53.54   2.047  0.09600 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 159.6 on 5 degrees of freedom
## Multiple R-squared:  0.456,  Adjusted R-squared:  0.3471 
## F-statistic:  4.19 on 1 and 5 DF,  p-value: 0.096
hundred_m <- athletics_m_final %>%
  filter(Event == "100 metres") 
hundred_lm_m = lm(as.numeric(hundred_m$Time)~hundred_m$BMI)
  summary(hundred_lm_m)
## 
## Call:
## lm(formula = as.numeric(hundred_m$Time) ~ hundred_m$BMI)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33756 -0.15240 -0.00886  0.13275  0.35931 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   11.96671    0.73988  16.174 9.31e-12 ***
## hundred_m$BMI -0.08084    0.03089  -2.617    0.018 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2086 on 17 degrees of freedom
## Multiple R-squared:  0.2871, Adjusted R-squared:  0.2452 
## F-statistic: 6.847 on 1 and 17 DF,  p-value: 0.01805
hundred_w <- athletics_w_final %>%
  filter(Event == "100 metres") 
hundred_lm_w = lm(as.numeric(hundred_w$Time)~hundred_w$BMI)
  summary(hundred_lm_w)
## 
## Call:
## lm(formula = as.numeric(hundred_w$Time) ~ hundred_w$BMI)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.55820 -0.21539 -0.06017  0.30644  0.78870 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   10.79371    1.30320   8.282 9.12e-07 ***
## hundred_w$BMI  0.01544    0.06271   0.246    0.809    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3704 on 14 degrees of freedom
## Multiple R-squared:  0.004311,   Adjusted R-squared:  -0.06681 
## F-statistic: 0.06062 on 1 and 14 DF,  p-value: 0.8091
eight_hundred_m <- athletics_m_final %>%
  filter(Event == "800 metres") 
eight_hundred_lm_m = lm(as.numeric(eight_hundred_m$Time)~eight_hundred_m$BMI)
  summary(eight_hundred_lm_m)
## 
## Call:
## lm(formula = as.numeric(eight_hundred_m$Time) ~ eight_hundred_m$BMI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -224.40 -101.87  -19.36   71.54  414.27 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          6064.99     547.33  11.081 6.47e-09 ***
## eight_hundred_m$BMI    11.89      25.20   0.472    0.643    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 163.8 on 16 degrees of freedom
## Multiple R-squared:  0.01373,    Adjusted R-squared:  -0.04791 
## F-statistic: 0.2227 on 1 and 16 DF,  p-value: 0.6433
eight_hundred_w <- athletics_w_final %>%
  filter(Event == "800 metres") 
eight_hundred_lm_w = lm(as.numeric(eight_hundred_w$Time)~eight_hundred_w$BMI)
  summary(eight_hundred_lm_w)
## 
## Call:
## lm(formula = as.numeric(eight_hundred_w$Time) ~ eight_hundred_w$BMI)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -255.54 -120.65  -45.21   97.93  374.22 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6912.428    580.984  11.898 3.17e-07 ***
## eight_hundred_w$BMI    7.934     28.403   0.279    0.786    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 189.4 on 10 degrees of freedom
## Multiple R-squared:  0.007743,   Adjusted R-squared:  -0.09148 
## F-statistic: 0.07803 on 1 and 10 DF,  p-value: 0.7857

Interestingly, the second order component of weight in the formula for BMI results in an improvement in performance (seconds decline) as BMI increases, up until approximately BMI = 25. Above that value, performance decreases. Perhaps, a certain amount of muscle mass is optimum to achieve the maximum amount of human speed, but above that BMI, it takes too much force to accelerate the mass.

 hundred_m_plot<- ggplot(data = hundred_m, aes(BMI, as.numeric(Time))) +xlab("BMI") + ylab("Sec") +
  geom_point() +
  ggtitle("Men's 100 Yard Dash Performance vs. BMI") +
  geom_smooth()
hundred_m_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

The comparison of women’s BMI to 100 yard dash performance does not replicate this trend, as was anticipated by the high p-value for the estimate and low r_square.

##Correlation: Performance
 hundred_w_plot<- ggplot(data = hundred_w, aes(BMI, as.numeric(Time))) +xlab("BMI") + ylab("Sec") +
  geom_point() +
  ggtitle("Women's 100 Yard Dash Performance vs. BMI") +
  geom_smooth()
hundred_w_plot

Select columns from Athletics tables for plotting women vs. men’s performance over time.

Olympic_m<- athletics_m_final 
Olympic_m$Sex = "M"

Olympic_w<- athletics_w_final
Olympic_w$Sex = "F"
 
Olympics_Athletics <- Olympic_m %>%
  rbind(Olympic_w) %>%
  select (9,10,7,2,11,12) 
  #filter(Rank == "1" )
head(Olympics_Athletics)
## # A tibble: 6 × 6
##   Sport     Event       Year Sex   Time  Rank 
##   <chr>     <chr>      <dbl> <chr> <chr> <chr>
## 1 Athletics 100 metres  2016 M     9.81  1    
## 2 Athletics 100 metres  2012 M     9.63  1    
## 3 Athletics 100 metres  2008 M     9.69  1    
## 4 Athletics 100 metres  2004 M     9.85  1    
## 5 Athletics 100 metres  2000 M     9.87  1    
## 6 Athletics 100 metres  1996 M     9.84  1

Plot women’s (green) performance against men’s performance (red) over time. Because of the extreme differences in time scale, a facet wrap did not generate meaningful results.

Women were not allowed to run the Marathon until 1984. An investigation of the sharp increase in time for the women’s Marathon in 1992 was due to a course that finished up a steep incline, sapping their strength at the end of the event.

## Male vs. Female: Track & Field
marathon <- Olympics_Athletics %>%
  filter(Event == "Marathon") 
  marathon$Hours = as.numeric(marathon$Time)/3600
head(marathon)
## # A tibble: 6 × 7
##   Sport     Event     Year Sex   Time  Rank  Hours
##   <chr>     <chr>    <dbl> <chr> <chr> <chr> <dbl>
## 1 Athletics Marathon  2016 M     7724  1      2.15
## 2 Athletics Marathon  2012 M     7681  1      2.13
## 3 Athletics Marathon  2008 M     7592  1      2.11
## 4 Athletics Marathon  2000 M     7811  1      2.17
## 5 Athletics Marathon  1996 M     7956  1      2.21
## 6 Athletics Marathon  1992 M     8003  1      2.22
Olympic_perf_plot <- ggplot(data = marathon, aes(Year, Hours, colour=Sex)) +
  coord_cartesian(ylim=c(2,3)) +
  geom_point() +
  ggtitle("Marathon") +
  geom_smooth()
Olympic_perf_plot

## Male vs. Female: Speed Skating

url <- "https://en.wikipedia.org/wiki/Speed_skating_at_the_Winter_Olympics"

scraplinks <- function(url){
  # Create an html document from the url
  webpage <- xml2::read_html(url)
  # Extract the URLs
  url_ <- webpage %>%
    rvest::html_nodes("a") %>%
    rvest::html_attr("href")
  # Extract the link text
  link_ <- webpage %>%
    rvest::html_nodes("a") %>%
    rvest::html_text()
  return(tibble(link = link_, url = url_))
}

url_links <- scraplinks(url)
Men_master <- url_links %>% filter(str_detect(url,"_Men")) %>% select(1,2)
Men_master <- Men_master %>% mutate(Year = substr(url,28,31)) %>% select(3,2)
Men_master <- Men_master %>% mutate(url = paste("https://en.wikipedia.org",url,sep=""))
Men_master <- Men_master %>% mutate(event = word(url,-2,sep='_')) %>% 
  mutate(event = gsub(",","",event)) %>% select(1,3,2)
Men_master <- Men_master %>% filter (!is.na(as.numeric(event)))

#500***********************************************************************************************
Men_master_500 <- Men_master %>% filter(event=="500")
n = count(Men_master_500)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Men_master_500$Year[i]
  event <- str_c(Men_master_500$event[i],"_meters")
  extractf <- Men_master_500$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1968" | year == "1984" | year == "1992" ) ~ 5,
    (year == "2014") ~ 8,
    (year == "2018") ~ 7,
    (year == "1932") ~ 9,
    TRUE ~ 6
  )
  sitef <- extractf[[k]]
  if (year == "2006") {
    sitef <- sitef %>% head(3) %>% select(`Total Time`)
    sitef <- rename(sitef, Time = `Total Time`)
  } 
  if (year == "1998" | year == "2002"| year == "2010" | year == "2014") {
    sitef <- sitef %>% head(3) %>% select(Total)
    sitef <- rename(sitef, Time = Total)
  }
  rank1 <- sitef %>% head(3) %>% select(Time)
  if (year == "1932"){
    rank1 <- sitef %>% head(1) %>% select(Time)
  }
  rank1 <- rank1 %>% mutate(Sport = "Skating", Event = event, Year = year, Gender = "Male" )
  rank1 <- rank1 %>% select (2,3,4,5,1) %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}

r_500 <- rank1 %>% na.omit

#1000**********************************************************************************************
Men_master_1000 <- Men_master %>% filter(event=="1000")
n = count(Men_master_1000)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Men_master_1000$Year[i]
  event <- str_c(Men_master_1000$event[i],"_meters")
  extractf <- Men_master_1000$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1984" | year == "1992" | year == "2006" | year == "2010") ~ 5,
    TRUE ~ 6
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Skating", Event = event, Year = year, Gender = "Male" )
  rank1 <- rank1 %>% select (2,3,4,5,1) %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}
r_1000 <- rank1 %>% na.omit

#1500**********************************************************************************************
Men_master_1500 <- Men_master %>% filter(event=="1500")
n = count(Men_master_1500)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Men_master_1500$Year[i]
  event <- str_c(Men_master_1500$event[i],"_meters")
  extractf <- Men_master_1500$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1964" | year == "1984" | year == "1992" | year == "2006" | year == "2010") ~ 5,
    (year == "1932") ~ 9,
    TRUE ~ 6
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  if (year == "1932"){
    rank1 <- sitef %>% head(1) %>% select(Time)
  }
  rank1 <- rank1 %>% mutate(Sport = "Skating", Event = event, Year = year, Gender = "Male" )
  rank1 <- rank1 %>% select (2,3,4,5,1) %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}

r_1500 <- rank1 %>% na.omit

#5000**********************************************************************************************
Men_master_5000 <- Men_master %>% filter(event=="5000")
n = count(Men_master_5000)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Men_master_5000$Year[i]
  event <- str_c(Men_master_5000$event[i],"_meters")
  extractf <- Men_master_5000$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1972" | year == "1976" | year == "1984" | year == "1992" | year == "2006") ~ 5,
    (year == "2014") ~ 7,
    TRUE ~ 6
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Skating", Event = event, Year = year, Gender = "Male" )
  rank1 <- rank1 %>% select (2,3,4,5,1) %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}
r_5000 <- rank1 %>% na.omit

#**************************************************************************************************
rm(Men_extract)
Men_extract <- bind_rows(r_500,r_1000,r_1500,r_5000)
head(Men_extract)
## # A tibble: 6 × 7
##   Sport   Event      Year  Gender Time      Rank Timeinsec
##   <chr>   <chr>      <chr> <chr>  <chr>    <int>     <dbl>
## 1 Skating 500_meters 1924  Male   0.0.44.0     1      44  
## 2 Skating 500_meters 1924  Male   0.0.44.2     2      44.2
## 3 Skating 500_meters 1924  Male   0.0.44.8     3      44.8
## 4 Skating 500_meters 1928  Male   0.0.43.4     1      43.4
## 5 Skating 500_meters 1928  Male   0.0.43.4     2      43.4
## 6 Skating 500_meters 1928  Male   0.0.43.6     3      43.6
#**************************************************************************************************
Women_master <- url_links %>% filter(str_detect(url,"_Women")) %>% select(1,2)
Women_master <- Women_master %>% mutate(Year = substr(url,28,31)) %>% select(3,2)
Women_master <- Women_master %>% mutate(url = paste("https://en.wikipedia.org",url,sep=""))
Women_master <- Women_master %>% mutate(event = word(url,-2,sep='_')) %>% 
  mutate(Event = gsub(",","",event)) %>% select(1,3,2)
Women_master <- Women_master %>% filter (!is.na(as.numeric(event)))

#Women 500*****************************************************************************************
Women_master_500 <- Women_master %>% filter(event=="500")
n = count(Women_master_500)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Women_master_500$Year[i]
  event <- str_c(Women_master_500$event[i],"_meters")
  extractf <- Women_master_500$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1968"| year == "1992" | year == "1994") ~ 5,
    TRUE ~ 6
  )
  sitef <- extractf[[k]]
  if (year == "1998" | year == "2002" | year == "2010") {
    #glimpse(sitef)
    sitef <- sitef %>% head(3) %>% select(Total)
    sitef <- rename(sitef, Time = Total)
  }
  if (year == "2006") {
    #glimpse(sitef)
    sitef <- sitef %>% head(3) %>% select(`Total Time`)
    sitef <- rename(sitef, Time = `Total Time`)
  }
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Skating", Event = event, Year = year, Gender = "Female" )
  rank1 <- rank1 %>% select (2,3,4,5,1) %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}
r_w_500 <- rank1 %>% na.omit

#Women 1000 ***************************************************************************************
Women_master_1000 <- Women_master %>% filter(event=="1000")
n = count(Women_master_1000)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Women_master_1000$Year[i]
  event <- str_c(Women_master_1000$event[i],"_meters")
  extractf <- Women_master_1000$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1992" | year == "1994" | year == "2006" | year == "2010") ~ 5,
    TRUE ~ 6
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Skating", Event = event, Year = year, Gender = "Female" )
  rank1 <- rank1 %>% select (2,3,4,5,1) %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}
r_w_1000 <- rank1 %>% na.omit

#Women 1500 ***************************************************************************************
Women_master_1500 <- Women_master %>% filter(event=="1500")
n = count(Women_master_1500)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Women_master_1500$Year[i]
  event <- str_c(Women_master_1500$event[i],"_meters")
  extractf <- Women_master_1500$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1992" | year == "1994" | year == "2006" | year == "2010" | year == "2018") ~ 5,
    TRUE ~ 6
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Skating", Event = event, Year = year, Gender = "Female")
  rank1 <- rank1 %>% select (2,3,4,5,1) %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}
r_w_1500 <- rank1 %>% na.omit

#Women 5000 ***************************************************************************************
Women_master_5000 <- Women_master %>% filter(event=="5000")
n = count(Women_master_5000)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Women_master_5000$Year[i]
  event <- str_c(Women_master_5000$event[i],"_meters")
  extractf <- Women_master_5000$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1992" | year == "1994" | year == "2006" | year == "2010") ~ 5,
    TRUE ~ 6
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Skating", Event = event, Year = year, Gender = "Female" )
  rank1 <- rank1 %>% select (2,3,4,5,1) %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}
r_w_5000 <- rank1 %>% na.omit

rm(Women_extract)
Women_extract <- bind_rows(r_w_500,r_w_1000,r_w_1500,r_w_5000)
head(Women_extract)
## # A tibble: 6 × 7
##   Sport   Event      Year  Gender Time      Rank Timeinsec
##   <chr>   <chr>      <chr> <chr>  <chr>    <int>     <dbl>
## 1 Skating 500_meters 1932  Female 0.0.60.4     1      60.4
## 2 Skating 500_meters 1960  Female 0.0.45.9     1      45.9
## 3 Skating 500_meters 1960  Female 0.0.46.0     2      46  
## 4 Skating 500_meters 1960  Female 0.0.46.1     3      46.1
## 5 Skating 500_meters 1964  Female 0.0.45       1      45  
## 6 Skating 500_meters 1964  Female 0.0.45.4     2      45.4

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Combine men and women’s files for plotting.

Olympics_speed_skating <- Men_extract %>%
  rbind(Women_extract)
head(Olympics_speed_skating )
## # A tibble: 6 × 7
##   Sport   Event      Year  Gender Time      Rank Timeinsec
##   <chr>   <chr>      <chr> <chr>  <chr>    <int>     <dbl>
## 1 Skating 500_meters 1924  Male   0.0.44.0     1      44  
## 2 Skating 500_meters 1924  Male   0.0.44.2     2      44.2
## 3 Skating 500_meters 1924  Male   0.0.44.8     3      44.8
## 4 Skating 500_meters 1928  Male   0.0.43.4     1      43.4
## 5 Skating 500_meters 1928  Male   0.0.43.4     2      43.4
## 6 Skating 500_meters 1928  Male   0.0.43.6     3      43.6

plot 500 metres Speed Skating

m500 <- Olympics_speed_skating %>%
  filter(Event == "500_meters") 
  m500$Sec = m500$Timeinsec
Olympic_perf_plot <- ggplot(data = m500, aes(Year, Sec, colour=Gender)) +
  coord_cartesian(ylim = c(30,80)) +
  geom_point() +
  ggtitle("500 metres: Olympic Women Speed Skating Performance vs. Men") +
  geom_smooth()
Olympic_perf_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

plot 1000 meter

m1000 <- Olympics_speed_skating %>%
  filter(Event == "1000_meters") 
  m1000$Sec = m1000$Timeinsec
Olympic_perf_plot <- ggplot(data = m1000, aes(Year, Sec, colour=Gender)) +
  coord_cartesian(ylim = c(60,130)) +
  geom_point() +
  ggtitle("1000 metres") +
  geom_smooth()
Olympic_perf_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

plot 1500

m1500 <- Olympics_speed_skating %>%
  filter(Event == "1500_meters") 
  m1500$Sec = m1500$Timeinsec
Olympic_perf_plot <- ggplot(data = m1500, aes(Year, Sec, colour=Gender)) +
  coord_cartesian(ylim = c(100,200)) +
  geom_point() +
  ggtitle("1500 metres: Olympic Women Speed Skating Performance vs. Men") +
  geom_smooth()
  
Olympic_perf_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

plot 5000 meters

m5000 <- Olympics_speed_skating %>%
  filter(Event == "5000_meters") 
  m5000$Sec = m5000$Timeinsec
Olympic_perf_plot <- ggplot(data = m5000, aes(Year, Sec, colour=Gender)) +
  coord_cartesian(ylim = c(300,600)) +
  geom_point() +
  ggtitle("5000 metres") +
  geom_smooth()
  
Olympic_perf_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

knitr::opts_chunk$set(echo = TRUE)

url <- "https://en.wikipedia.org/wiki/Swimming_at_the_Summer_Olympics"

scraplinks <- function(url){
# Create an html document from the url
webpage <- xml2::read_html(url)
# Extract the URLs
url_ <- webpage %>%
rvest::html_nodes("a") %>%
rvest::html_attr("href")
# Extract the link text
link_ <- webpage %>%
rvest::html_nodes("a") %>%
rvest::html_text()
return(tibble(link = link_, url = url_))
}

url_links <- scraplinks(url)
Men_master <- url_links %>% filter(str_detect(url,"_Men"))  %>% filter(str_detect(link,"X")) %>%
            filter(str_detect(url,"_sailors",TRUE)) %>%
            filter(str_detect(url,"_yard",TRUE)) %>% 
            filter(str_detect(url,"_mile",TRUE)) %>% 
            filter(str_detect(url,"_medley",TRUE)) %>% 
            filter(str_detect(url,"_relay",TRUE)) %>% 
            filter(str_detect(url,"_obstacle",TRUE)) %>% 
            filter(str_detect(url,"_underwater",TRUE)) %>% 
            filter(str_detect(url,"_marathon",TRUE)) %>%
            filter(str_detect(url,"_team",TRUE)) %>% select(1,2)
Men_master <- Men_master %>% mutate(Year = substr(url,23,26)) %>% select(3,2)
Men_master <- Men_master %>% mutate(url = paste("https://en.wikipedia.org",url,sep=""))
Men_master <- Men_master %>% mutate(event = word(url,9,end=11,sep='_')) %>% 
  mutate(event = gsub(",","",event)) %>% select(1,3,2)


### ************************** 50_metre_freestyle ### ************************** 
Men_master_50 <- Men_master %>% filter(event=="50_metre_freestyle")
n = count(Men_master_50)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Men_master_50$Year[i]
  event <- Men_master_50$event[i]
  extractf <- Men_master_50$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1988" | year == "2020" ) ~ 9,(year == "1992" | year == "1996" | year == "2008" ) ~ 8,
    (year == "2000" | year == "2004" | year == "2012" | year == "2016") ~ 7, TRUE ~ 6
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Swimming", Event = event, Year = year, Gender = "Male" )
  rank1 <- rank1 %>% select (2,3,4,5,1) %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}
rank50mtre_fs <- rank1 %>% na.omit()
### ************************** 50_metre_freestyle Men ### ************************** 


### ************************** 100_metre_freestyle Men ### ************************** 

Men_master_100_fs <- Men_master %>% filter(event=="100_metre_freestyle")
n = count(Men_master_100_fs)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Men_master_100_fs$Year[i]
  event <- Men_master_100_fs$event[i]
  extractf <- Men_master_100_fs$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1896" ) ~ 5,
    (year == "1960" | year == "1964" | year == "1968" | year == "1980" | year == "1992" | year == "1996" | year == "2000" | year == "2004" | year == "2008" | year == "2012" | year == "2016") ~ 8, 
    (year == "1952" | year == "1956" | year == "1972" | year == "1976" | year == "1984" | year == "1988" | year == "2020") ~ 9,
    (year == "2000" | year == "2004" | year == "2012" | year == "2016") ~ 7,
    (year == "1908") ~ 17,
    (year == "1912") ~ 20, 
    (year == "1932") ~ 12,
    (year == "1920" | year == "1924" | year == "1948") ~ 14, 
    (year == "1936") ~ 15, 
    (year == "1928") ~ 16, 
    TRUE ~ 6
  )
  sitef <- extractf[[k]]
  if (colnames(sitef)[4] == "Time (1st)"){
    colnames(sitef)[4] <- "Time"
  }
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Swimming", Event = event, Year = year, Gender = "Male" )
  rank1 <- rank1 %>% select (2,3,4,5,1) %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}

rank100mtre_fs <- rank1 %>% na.omit()

### ************************** 100_metre_freestyle Men ### ************************** 


### ************************** 200_metre_freestyle Men ### ************************** 

Men_master_200_fs <- Men_master %>% filter(event=="200_metre_freestyle")
n = count(Men_master_200_fs)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Men_master_200_fs$Year[i]
  event <- Men_master_200_fs$event[i]
  extractf <- Men_master_200_fs$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1900" ) ~ 10, 
    (year == "1984" | year == "1988" | year == "1992" | year == "1996" 
     | year == "2000" | year == "2004" | year == "2008" |  year == "2020") ~ 9,
    (year == "1968" | year == "1972" | year == "1976" ) ~ 7,
    (year == "1980" | year == "2012" | year == "2016") ~ 8, 
    TRUE ~ 6
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Swimming", Event = event, Year = year, Gender = "Male" )
  rank1 <- rank1 %>% select (2,3,4,5,1)  %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 9) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}

rank200mtre_fs <- rank1 %>% na.omit()

### ************************** 200_metre_freestyle Men ### ************************** 

### ************************** 400_metre_freestyle Men ### ************************** 

Men_master_400_fs <- Men_master %>% filter(event=="400_metre_freestyle")
n = count(Men_master_400_fs)$n
rm(rprev)
rm(rank1)
k=0
for (i in 1:n) {
  year <- Men_master_400_fs$Year[i]
  event <- Men_master_400_fs$event[i]
  extractf <- Men_master_400_fs$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1900" | year == "1964") ~ 10, (year == "1948") ~5, ( year == "2004" | year == "2008" | year == "2016" ) ~6, (year == "1912" | year == "1928") ~13, (year == "1920" | year == "1924" | year == "1936") ~12, (year == "1968" | year == "1972"   | year == "1980" | year =="1996" | year == "2000"| year == "2020" |year == "2012") ~ 7,(year == "1956" | year == "1960") ~9,
    ( year == "1984" | year == "1988" | year == "1992") ~ 8, (year == "1956" | year == "1976" | year == "1932"| year == "1952") ~11, (year == "1908") ~16,
    TRUE ~ 15
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Swimming", Event = event, Year = year, Gender = "Male" )
  rank1 <- rank1 %>% select (2,3,4,5,1)   %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}

rank400mtre_fs <- rank1 %>% na.omit()

### ************************** 400_metre_freestyle Men ### ************************** 

### ************************** 100_metre_backstroke Men ### ************************** 

Men_master_100_bckstroke <- Men_master %>% filter(event=="100_metre_backstroke")
n = count(Men_master_100_bckstroke)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Men_master_100_bckstroke$Year[i]
  event <- Men_master_100_bckstroke$event[i]
  extractf <- Men_master_100_bckstroke$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1980" | year=="1984"| year == "1996" | year =="2004" | year == "1920") ~7, (year == "1908") ~14,
    (year == "1988" | year =="1992"| year == "2020") ~ 8,
    (year == "1912" | year == "1924" | year == "1928" | year == "1968") ~ 12,
    (year == "1932" | year == "1952" | year == "1960" | year =="1972" | year == "1976") ~ 10,
    (year == "1936" | year == "1948" ) ~ 11,
    (year == "1956" | year == "2000"|year =="2008" | year == "2012"|year == "2016") ~9,
    TRUE ~ 6
  )

  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Swimming", Event = event, Year = year, Gender = "Male" )
  rank1 <- rank1 %>% select (2,3,4,5,1)    %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 9) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}
rank100mtre_backstroke <- rank1 %>% na.omit()

### ************************** 100_metre_backstroke Men Ends ### ************************** 


### ************************** 100_metre_breaststroke Men  ### ************************** 
Men_master_100_breaststroke<- Men_master %>% filter(event=="100_metre_breaststroke")
n = count(Men_master_100_breaststroke)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Men_master_100_breaststroke$Year[i]
  event <- Men_master_100_breaststroke$event[i]
  extractf <- Men_master_100_breaststroke$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1980") ~ 6,
    (year == "1984" | year =="1988"| year == "1992" | year =="1996" | year == "2020") ~ 8,
    (year == "1972") ~ 10,
    (year == "1968") ~ 11,
    (year == "1976" | year == "2000"|year =="2004" | year == "2008"|year == "2012" | year == "2016") ~9,
    TRUE ~ 6
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Swimming", Event = event, Year = year, Gender = "Male" )
  rank1 <- rank1 %>% select (2,3,4,5,1) %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}
rank100mtre_breaststroke <- rank1  %>% na.omit
### ************************** 100metre_breaststroke Men Ends ### ************************** 

### ************************** 100_metre_butterfly Men ### ************************** 
Men_master_100_metre_butterfly<- Men_master %>% filter(event=="100_metre_butterfly")
n = count(Men_master_100_metre_butterfly)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Men_master_100_metre_butterfly$Year[i]
  event <- Men_master_100_metre_butterfly$event[i]
  extractf <- Men_master_100_metre_butterfly$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1980" | year == "1992") ~7,(year == "1984" | year =="1988"| year == "1996" | year =="2012" ) ~ 8,
    (year == "1972" | year =="1976") ~ 10,(year == "1968") ~ 14,
    (year == "2000" | year == "2004"|year =="2008" | year == "2016"|year == "2020" ) ~9,
    TRUE ~ 6
  )
   sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Swimming", Event = event, Year = year, Gender = "Male" )
  rank1 <- rank1 %>% select (2,3,4,5,1)  %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 9) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}

rank100mtre_butterfly <- rank1 %>% na.omit
### ************************** 100_metre_butterfly Men Ends ### ************************** 

rm(Final_Men_RankingByTime)

Final_Men_RankingByTime <- bind_rows(rank50mtre_fs,
                                     rank100mtre_fs,
                                     rank200mtre_fs,
                                     rank400mtre_fs,
                                     rank100mtre_backstroke,
                                     rank100mtre_breaststroke,
                                     rank100mtre_butterfly)
head(Final_Men_RankingByTime)
## # A tibble: 6 × 7
##   Sport    Event              Year  Gender Time       Rank Timeinsec
##   <chr>    <chr>              <chr> <chr>  <chr>     <int>     <dbl>
## 1 Swimming 50_metre_freestyle 1988  Male   0.0.22.14     1      22.1
## 2 Swimming 50_metre_freestyle 1988  Male   0.0.22.36     2      22.4
## 3 Swimming 50_metre_freestyle 1988  Male   0.0.22.71     3      22.7
## 4 Swimming 50_metre_freestyle 1992  Male   0.0.21.91     1      21.9
## 5 Swimming 50_metre_freestyle 1992  Male   0.0.22.09     2      22.1
## 6 Swimming 50_metre_freestyle 1992  Male   0.0.22.3      3      22.3
url <- "https://en.wikipedia.org/wiki/Swimming_at_the_Summer_Olympics"

scraplinks <- function(url){
  # Create an html document from the url
  webpage <- xml2::read_html(url)
  # Extract the URLs
  url_ <- webpage %>%
    rvest::html_nodes("a") %>%
    rvest::html_attr("href")
  link_ <- webpage %>%
    rvest::html_nodes("a") %>%
    rvest::html_text()
  return(tibble(link = link_, url = url_))
  # Extract the link text
}

url_links <- scraplinks(url)
Women_master <- url_links %>% filter(str_detect(url,"_Women"))  %>% filter(str_detect(link,"X")) %>%
  filter(str_detect(url,"_sailors",TRUE)) %>%
  filter(str_detect(url,"_yard",TRUE)) %>% 
  filter(str_detect(url,"_mile",TRUE)) %>% 
  filter(str_detect(url,"_medley",TRUE)) %>% 
  filter(str_detect(url,"_relay",TRUE)) %>% 
  filter(str_detect(url,"_obstacle",TRUE)) %>% 
  filter(str_detect(url,"_underwater",TRUE)) %>% 
  filter(str_detect(url,"_marathon",TRUE)) %>%
  filter(str_detect(url,"_team",TRUE)) %>% select(1,2)
Women_master <- Women_master %>% mutate(Year = substr(url,23,26)) %>% select(3,2)
Women_master <- Women_master %>% mutate(url = paste("https://en.wikipedia.org",url,sep=""))
Women_master <- Women_master %>% mutate(event = word(url,9,end=11,sep='_')) %>% 
  mutate(event = gsub(",","",event)) %>% select(1,3,2)


### ************************** 50_metre_freestyle Women ### ************************** 

Women_master_50<- Women_master %>% filter(event=="50_metre_freestyle")
n = count(Women_master_50)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Women_master_50$Year[i]
  event <- Women_master_50$event[i]
  extractf <- Women_master_50$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "2000") ~ 10, (year == "2008" | year == "2020") ~9, (year == "2012") ~10, 
    (year == "1988" | year == "1992" | year == "2004" | year == "2016") ~ 8,(year == "1996") ~ 7,
    TRUE ~ 6
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Swimming", Event = event, Year = year, Gender = "Female" )
  rank1 <- rank1 %>% select (2,3,4,5,1) %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}

rank50mtre_fs_wn <- rank1 %>% na.omit()


### ************************** 50_metre_freestyle Women Ends ### ************************** 


### ************************** 100_metre_freestyle Women ### ************************** 

Women_master_100_fs <- Women_master %>% filter(event=="100_metre_freestyle")
n = count(Women_master_100_fs)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Women_master_100_fs$Year[i]
  event <- Women_master_100_fs$event[i]
    extractf <- Women_master_100_fs$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1948" ) ~ 6,
    (year == "1920" | year == "1992" | year == "1996") ~ 8, 
    (year == "1956" | year == "2000" | year == "2004" | year == "2008" | year == "2012" | year == "2016" ) ~ 9,
    (year == "1960" | year == "1980" | year == "1984" | year == "1988") ~ 7,
    (year == "2020") ~ 10, (year == "1936") ~ 12, (year == "1968") ~ 14, 
    (year == "1924" | year =="1932"  | year == "1952" | year == "1964" | year == "1972") ~ 11, 
    (year == "1976") ~ 13, 
    TRUE ~ 6
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Swimming", Event = event, Year = year, Gender = "Female" )
  rank1 <- rank1 %>% select (2,3,4,5,1)  %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}

rank100mtre_fs_wn <- rank1%>% na.omit()

## ****************************** 100_meter_freestyle Woment ends**********************

## ****************************** 200_meter_freestyle Woment **********************

Women_master_200_fs <- Women_master %>% filter(event=="200_metre_freestyle")
n = count(Women_master_200_fs)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Women_master_200_fs$Year[i]
  event <- Women_master_200_fs$event[i]
  extractf <- Women_master_200_fs$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1968" ) ~ 10,
    (year == "1976" | year == "2008" | year == "2012") ~ 9,
    (year == "1980" | year == "1984" | year == "1992" | year == "1996"| year == "2000") ~ 7,
    (year == "1972" | year == "1988" | year == "2004" | year == "2016" | year =="2020") ~ 8,
    TRUE ~ 6
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Swimming", Event = event, Year = year, Gender = "Female" )
  rank1 <- rank1 %>% select (2,3,4,5,1)   %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}
rank200mtre_fs_wn <- rank1%>% na.omit()

## ****************************** 200_meter_freestyle Women ends **********************

## ****************************** 400_meter_freestyle Women ends **********************

Women_master_400_fs <- Women_master %>% filter(event=="400_metre_freestyle")
n = count(Women_master_400_fs)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Women_master_400_fs$Year[i]
  event <- Women_master_400_fs$event[i]
  extractf <- Women_master_400_fs$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1932" | year =="1952") ~ 10, (year == "1924" | year =="1928" | year =="1936") ~11, (year == "1948") ~5,
    (year == "1968" | year == "1976" | year == "" | year == "" | year == "") ~ 9,
    (year == "1972" | year == "1980" | year == "1992" | year == "1996" 
     | year == "2008" | year == "2012" | year  == "2016" | year == "2020") ~ 7,
    (year == "1956" | year == "1964" | year == "1984" | year =="1988") ~ 8, 
    ( year == "1960" | year =="2000" | year == "2004") ~6,
    TRUE ~ 6
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Swimming", Event = event, Year = year, Gender = "Female" )
  rank1 <- rank1 %>% select (2,3,4,5,1)  %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}
rank400mtre_fs_wn <- rank1%>% na.omit()

## ****************************** 400_meter_freestyle Women ends **********************

## ****************************** 100_metre_backstroke Women  **********************
Women_master_100_bckstroke <- Women_master %>% filter(event=="100_metre_backstroke")
n = count(Women_master_100_bckstroke)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Women_master_100_bckstroke$Year[i]
  event <- Women_master_100_bckstroke$event[i]
  extractf <- Women_master_100_bckstroke$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1924" | year == "1932" | year == "1956" | year =="1960" | year == "1964" | year == "1980" | year =="1996") ~7, 
    (year == "1928" | year =="1984"| year == "1992" | year =="2016") ~ 8,
    (year == "1968") ~ 14, (year =="1948") ~5, (year == "1952") ~6,
    (year == "1936" | year == "1976"| year =="1988" | year == "2000"|year == "2004" 
      | year =="2008" | year == "1972" | year == "2012" | year == "2020") ~9,
    TRUE ~ 21
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Swimming", Event = event, Year = year, Gender = "Female" )
  rank1 <- rank1 %>% select (2,3,4,5,1) %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}
rank100mtrebackstroke_fs_wn <- rank1%>% na.omit()

## ****************************** 100_metre_backstroke Women ends **********************


## ****************************** 100_metre_breaststroke Women  **********************
Women_master_100_breaststroke<- Women_master %>% filter(event=="100_metre_breaststroke")
n = count(Women_master_100_breaststroke)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Women_master_100_breaststroke$Year[i]
  event <- Women_master_100_breaststroke$event[i]
  extractf <- Women_master_100_breaststroke$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1992" | year == "1980") ~ 7,
    (year == "1984" | year =="1988"| year == "1996" | year =="2000") ~ 8,
    (year == "1968" | year == "1976") ~ 10,
    (year == "1972"|year =="2004" | year == "2008"|year == "2012" | year == "2016"|year == "2020") ~9,
    TRUE ~ 5
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Swimming", Event = event, Year = year, Gender = "Female" )
  rank1 <- rank1 %>% select (2,3,4,5,1)  %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}
rank100mtrebreaststroke_fs_wn <- rank1%>% na.omit()

## ****************************** 100_metre_breaststroke Women ends **********************

## ****************************** 100_metre_butterfly Women ends **********************
Women_master_100_metre_butterfly<- Women_master %>% filter(event=="100_metre_butterfly")
n = count(Women_master_100_metre_butterfly)$n
rm(rprev)
rm(rank1)
for (i in 1:n) {
  year <- Women_master_100_metre_butterfly$Year[i]
  event <- Women_master_100_metre_butterfly$event[i]
  extractf <- Women_master_100_metre_butterfly$url[i] %>% read_html() %>% html_table(fill = TRUE)
  k <- case_when(
    (year == "1956" | year == "1980") ~6, (year == "1968") ~ 11, (year == "1960" | year =="1996") ~ 7,
    (year == "1964") ~ 12, (year == "1976") ~10, (year == "1984" | year == "1988" 
                                                  | year == "1992" | year == "2004" | year == "2008" | year == "2020") ~8,
    (year == "1972" | year == "2000"|year =="2012" | year == "2016"|year == " " ) ~9,
    TRUE ~ 11
  )
  sitef <- extractf[[k]]
  rank1 <- sitef %>% head(3) %>% select(Time)
  rank1 <- rank1 %>% mutate(Sport = "Swimming", Event = event, Year = year, Gender = "Female" )
  rank1 <- rank1 %>% select (2,3,4,5,1)  %>% mutate(Time = str_extract(Time,"[0-9':''.']+"))
  rank1 <- rank1 %>% mutate(Rank = seq.int(nrow(rank1)))
  if(str_detect(rank1$Time,":") == "FALSE"){
    rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  }
  rank1$Time <- paste("0.",trimws(rank1$Time),sep="")
  rank1$Time <- str_replace((rank1$Time),":",".")
  rank1 <- rank1 %>% mutate(Timeinsec = period_to_seconds(hms(Time)))
  if (i > 1) {
    rank1 <- rbind(rprev,rank1)
  }
  rprev <- rank1
}

rank100mtrebutterfly_fs_wn <- rank1%>% na.omit()
## ****************************** 100_metre_ butterfly Women ends **********************

rm(Final_Women_RankingByTime)
Final_Women_RankingByTime <-bind_rows(rank50mtre_fs_wn,
                                      rank100mtre_fs_wn,
                                      rank200mtre_fs_wn,
                                      rank400mtre_fs_wn,
                                      rank100mtrebackstroke_fs_wn,
                                      rank100mtrebreaststroke_fs_wn,
                                      rank100mtrebutterfly_fs_wn)

head(Final_Women_RankingByTime)
## # A tibble: 6 × 7
##   Sport    Event              Year  Gender Time       Rank Timeinsec
##   <chr>    <chr>              <chr> <chr>  <chr>     <int>     <dbl>
## 1 Swimming 50_metre_freestyle 1988  Female 0.0.25.49     1      25.5
## 2 Swimming 50_metre_freestyle 1988  Female 0.0.25.64     2      25.6
## 3 Swimming 50_metre_freestyle 1988  Female 0.0.25.71     3      25.7
## 4 Swimming 50_metre_freestyle 1992  Female 0.0.24.79     1      24.8
## 5 Swimming 50_metre_freestyle 1992  Female 0.0.25.08     2      25.1
## 6 Swimming 50_metre_freestyle 1992  Female 0.0.25.23     3      25.2

Combine women’s and men’s swimming files for plotting

Olympics_swimming <- Final_Men_RankingByTime %>%
  rbind(Final_Women_RankingByTime)
head(Olympics_swimming)
## # A tibble: 6 × 7
##   Sport    Event              Year  Gender Time       Rank Timeinsec
##   <chr>    <chr>              <chr> <chr>  <chr>     <int>     <dbl>
## 1 Swimming 50_metre_freestyle 1988  Male   0.0.22.14     1      22.1
## 2 Swimming 50_metre_freestyle 1988  Male   0.0.22.36     2      22.4
## 3 Swimming 50_metre_freestyle 1988  Male   0.0.22.71     3      22.7
## 4 Swimming 50_metre_freestyle 1992  Male   0.0.21.91     1      21.9
## 5 Swimming 50_metre_freestyle 1992  Male   0.0.22.09     2      22.1
## 6 Swimming 50_metre_freestyle 1992  Male   0.0.22.3      3      22.3

plot 100 metre freestyle

m100fs <- Olympics_swimming %>%
  filter(Event == "100_metre_freestyle") 
  m100fs$Sec = m100fs$Timeinsec
Olympic_perf_plot <- ggplot(data = m100fs, aes(Year, Sec, colour=Gender)) +
  coord_cartesian(ylim = c(30,70)) +
  geom_point() +
  ggtitle("100 metres Freestyle: Olympic Women Swimming Performance vs. Men") +
  geom_smooth()
Olympic_perf_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

plot 100 metre Butterfly

m100bf <- Olympics_swimming %>%
  filter(Event == "100_metre_butterfly") 
  m100bf$Sec = m100bf$Timeinsec
Olympic_perf_plot <- ggplot(data = m100bf, aes(Year, Sec, colour=Gender)) +
  coord_cartesian(ylim = c(30,70)) +
  geom_point() +
  ggtitle("100 metres Butterfly: Olympic Women Swimming Performance vs. Men") +
  geom_smooth()
Olympic_perf_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

plot 400 metre Freestyle

m400fs <- Olympics_swimming %>%
  filter(Event == "400_metre_freestyle") 
  m400fs$Sec = m400fs$Timeinsec
Olympic_perf_plot <- ggplot(data = m400fs, aes(Year, Sec, colour=Gender)) +
  coord_cartesian(ylim = c(30,500)) +
  geom_point() +
  ggtitle("400 metres Freestyle: Olympic Women Swimming Performance vs. Men") +
  geom_smooth()
Olympic_perf_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

plot 100 metre Backstroke

m100bs <- Olympics_swimming %>%
  filter(Event == "100_metre_backstroke") 
  m100bs$Sec = m100bs$Timeinsec
Olympic_perf_plot <- ggplot(data = m100bs, aes(Year, Sec, colour=Gender)) +
  coord_cartesian(ylim = c(30,100)) +
  geom_point() +
  ggtitle("100 metres Backstroke") +
  geom_smooth()
Olympic_perf_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'